In this section we will look at descriptive statistics from covariates, and from the face PCA.

Preliminaries

Libraries

library(tidyverse)
library(GGally)
library(ggpubr)
library(ggridges)
library(ggrepel)
library(pophelper)
source("PlotFaces.R")
source("Distances.R")
source("PlotPCs.R")

Databases

setwd('..')
path <- getwd()
setwd(paste(path, "/Results/MergedData", sep = ""))
load("MergedDat.RData")
MergedDat[[1]]$Sex <- as.factor(MergedDat[[1]]$Sex)
MergedDat[[1]]$CS  <- log(MergedDat[[1]]$CS)

setwd(paste(path, "/Results/FacePCA", sep = ""))
face.eigenvalues  <- read_delim("eigenvalues.csv", delim = " ", col_names = FALSE)
face.eigenvectors <- read_csv("eigenvectors.csv", col_names = FALSE)
face.facets       <- read_csv("facets.csv", col_names = FALSE)
face.means        <- read_csv("means.csv", col_names = FALSE)

setwd(paste(path, "/Results/GenPCA", sep = ""))
gen.eigenvalues <- read_delim("PCA.eigenval", delim = " ", col_names = FALSE)

Descriptive and exploratory analyses

We can look at some summaries. We have a total of 3427 subjects with Genetic and Face data.

MergedDat[[1]] %>% dplyr::select(c("Age", "Height", "CS", "Weight", "BMI") ) %>%
  summarise_all(funs( mean(. , na.rm = T), sd(. , na.rm = T) )) %>% print()
## # A tibble: 1 x 10
##   Age_mean Height_mean CS_mean Weight_mean BMI_mean Age_sd Height_sd
##      <dbl>       <dbl>   <dbl>       <dbl>    <dbl>  <dbl>     <dbl>
## 1     27.9        169.    1.41        70.7     24.7   13.6      9.22
## # ... with 3 more variables: CS_sd <dbl>, Weight_sd <dbl>, BMI_sd <dbl>
MergedDat[[1]] %>% group_by(prediction, Sex) %>% dplyr::select(Age, Height, CS, Weight) %>% 
            summarise_all(funs(mean(. , na.rm = T), sd(. , na.rm = T))) %>% print()
## Adding missing grouping variables: `prediction`, `Sex`
## # A tibble: 10 x 10
## # Groups:   prediction [?]
##    prediction Sex    Age_mean Height_mean CS_mean Weight_mean Age_sd
##    <fct>      <fct>     <dbl>       <dbl>   <dbl>       <dbl>  <dbl>
##  1 AFR        Female     23.9        164.    1.41        73.1   9.47
##  2 AFR        Male       24.7        176.    1.42        83.2   9.27
##  3 AMR        Female     25.2        163.    1.40        61.9   9.72
##  4 AMR        Male       25.8        176.    1.42        77.4   9.45
##  5 EAS        Female     22.6        161.    1.40        56.3   5.23
##  6 EAS        Male       23.6        173.    1.42        71.3   8.66
##  7 EUR        Female     29.7        165.    1.40        66.2  15.1 
##  8 EUR        Male       28.8        178.    1.42        81.4  14.7 
##  9 SAS        Female     22.6        160.    1.40        58.6   7.46
## 10 SAS        Male       23.2        173.    1.42        74.4   6.87
## # ... with 3 more variables: Height_sd <dbl>, CS_sd <dbl>, Weight_sd <dbl>
sexbypop <- function(dat, var, y){
  p1 <- ggplot(dat, aes(interaction(Sex, prediction), var, colour = prediction)) +
    geom_boxplot() + 
    theme_pubr() + 
    labs(x = "Sex by population", colour = "Population", y = y) + 
    scale_x_discrete(labels=c())
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}

p1 <- sexbypop(MergedDat[[1]], MergedDat[[1]]$Height, "Height")
p2 <- sexbypop(MergedDat[[1]], MergedDat[[1]]$Age, "Age")
p3 <- sexbypop(MergedDat[[1]], MergedDat[[1]]$Weight, "Weight")
p4 <- sexbypop(MergedDat[[1]], MergedDat[[1]]$BMI, "BMI")
p5 <- sexbypop(MergedDat[[1]], MergedDat[[1]]$CS, "CS")

ggarrange(p1, p2, p3, p4, p5,
          common.legend = TRUE, legend = "right")
## Warning: Removed 145 rows containing non-finite values (stat_boxplot).

## Warning: Removed 145 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 152 rows containing non-finite values (stat_boxplot).
## Warning: Removed 156 rows containing non-finite values (stat_boxplot).
## Warning: Removed 104 rows containing non-finite values (stat_boxplot).

We can also explore some correlograms

p1 <- ggpairs(MergedDat[[1]], columns = c(3:8,10,11,60,61), ggplot2::aes(colour=Sex, alpha = 0.1), 
              lower = list(continuous = "smooth")) + theme_pubr()
p1 <- ggpar(p1, palette = "jama")
p1

Face PCA

PCA screeploy

barplot(c(as.matrix(face.eigenvalues)), 
       main = "Face PCA Variances",
       xlab = "Principal Components",
       ylab = "Eigenvalue",
       col = "steelblue")

In the case of the Face PCA, the first 10 PCs account for 83.9502428 percent of the total variance, while the first 4 PCs account for 66.1452966

We can use joy plots to compare the distributions of each face PC grouped by sex. Note that I’ll use the mahalanobis space instead of the euclidean one.

ts <- MergedDat[[1]] %>% select(starts_with("Face"))
p1 <- PlotPCs(ts, MergedDat[[1]]$Sex ,"Face_PC1", "Face_PC9")
p2 <- PlotPCs(ts, MergedDat[[1]]$Sex, "Face_PC10", "Face_PC19")
p3 <- PlotPCs(ts, MergedDat[[1]]$Sex, "Face_PC20", "Face_PC29")
p4 <- PlotPCs(ts, MergedDat[[1]]$Sex, "Face_PC30", "Face_PC39")
remove(ts)
ggarrange(p1, p2, p3, p4,
          common.legend = T, legend = "bottom")

We can also see the distribution in the first PCs of sexes by each continental group

plotsexes <- function(PCA, cont, x1, y1){
  highPCA <- filter(PCA, cont)
  backPCA <- filter(PCA, !cont)
  sexes   <- filter(MergedDat[[1]], cont)$Sex
  title   <- as.character(MergedDat[[1]]$prediction[cont][1])
  avgs    <- highPCA %>% select(x1, y1, Sex) %>% 
                  group_by(Sex) %>% summarise_all(funs(mean))
  p1 <- ggplot(highPCA, aes_string(x = x1, y = y1)) + 
            geom_point(aes(color = sexes)) +
            geom_point(data = avgs, size = 5, aes(color = Sex)) +
            geom_point(data = avgs, size = 3, color = "black") +
            geom_point(data = backPCA, alpha = 0.01, color = "black", aes_string(x = x1, y = y1)) + 
            theme_pubr() + 
            ggtitle(title)
  p1 <- ggpar(p1, palette = "jco")
  return(p1)
}

p1 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "AFR", "Face_PC1", "Face_PC2")
p2 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "AFR", "Face_PC3", "Face_PC4")
p3 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "AMR", "Face_PC1", "Face_PC2")
p4 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "AMR", "Face_PC3", "Face_PC4")
p5 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "EAS", "Face_PC1", "Face_PC2")
p6 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "EAS", "Face_PC3", "Face_PC4")
p7 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "SAS", "Face_PC1", "Face_PC2")
p8 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "SAS", "Face_PC3", "Face_PC4")
p9 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "EUR", "Face_PC1", "Face_PC2")
p10 <- plotsexes(MergedDat[[1]], MergedDat[[1]]$prediction == "EUR", "Face_PC3", "Face_PC4")

ggarrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, 
          ncol = 2, nrow = 5,
          common.legend = TRUE, legend = "bottom", 
          align = "v")

Genetic PCA

Similarly, we can look at the genetic PCs with respect to populations

PCA screeplot

barplot(c(as.matrix(gen.eigenvalues)), 
       main = "Genetic PCA Variances",
       xlab = "Principal Components",
       ylab = "Eigenvalue",
       col = "steelblue")

In the case of the Genetic PCA, the first 10 PCs account for no more than 85.8977371 percent of the total variance, while the first 4 PCs account for no more than 82.3729178.

We can use joy plots to compare the distributions of each genetic PC grouped by population.

source("PlotPCs.R")
ts <- MergedDat[[1]] %>% select(starts_with("Gen"))
p1 <- PlotPCs(ts, MergedDat[[1]]$prediction, "PC1", "PC9")
p2 <- PlotPCs(ts, MergedDat[[1]]$prediction, "PC10", "PC19")
p3 <- PlotPCs(ts, MergedDat[[1]]$prediction, "PC20", "PC29")
p4 <- PlotPCs(ts, MergedDat[[1]]$prediction, "PC30", "PC39")
ggarrange(p1, p2, p3, p4,
          common.legend = TRUE, legend = "right", 
          align = "v")

This is the scatter plot of the 1000 Genomes populations. These are the at the 1000Genomes populations:

POP SUPER POP Description
FIN EUR Finnish in Finland
CHS EAS Han Chinese South
GBR EUR British from England and Scotland
PUR AMR Puerto Rican in Puerto Rico
CLM AMR Colombian in Medellin, Colombia
MXL AMR Mexican Ancestry in Los Angeles, California
TSI EUR Toscani in Italia
LWK AFR Luhya in Webuye, Kenya
JPT EAS Japanese in Tokyo, Japan
IBS EUR Iberian populations in Spain
PEL AMR Peruvian in Lima, Peru
CDX EAS Chinese Dai in Xishuangbanna, China
YRI AFR Yoruba in Ibadan, Nigeria
KHV EAS Kinh in Ho Chi Minh City, Vietnam
ASW AFR African ancestry in Southwest USA
ACB AFR African Caribbean in Barbados
CHB EAS Han Chinese in Beijing, China
GIH SAS Gujarati Indians in Houston, Texas
GWD AFR Gambian in Western Division, The Gambia
PJL SAS Punjabi in Lahore,Pakistan
MSL AFR Mende in Sierra Leone
BEB SAS Bengali in Bangladesh
ESN AFR Esan in Nigeria
STU SAS Sri Lankan Tamil in the UK
ITU SAS Indian Telugu in the UK
CEU EUR Utah residents with Northern and Western European ancestry from the CEPH collection
plotoneg <- function(dat, x1, y1){
  p1 <- ggplot(dat, aes_string(x = x1, y = y1, color = "super_pop")) +
            geom_point(alpha = 0.3, size = 2) + labs(color = "Pop") +
            theme_pubr()
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}

p1 <- plotoneg(MergedDat[[2]], "PC1", "PC2")
p2 <- plotoneg(MergedDat[[2]], "PC3", "PC4")
p3 <- plotoneg(MergedDat[[2]], "PC5", "PC6")
p4 <- plotoneg(MergedDat[[2]], "PC7", "PC8")
p5 <- plotoneg(MergedDat[[2]], "PC9", "PC10")
p6 <- plotoneg(MergedDat[[2]], "PC11", "PC12")

ggarrange(p1, p2, p3, p4, p5, p6, 
          nrow = 3, ncol = 2,
          common.legend = TRUE, legend = "right", 
          align = "v")

We can plot our samples with the centroid of the HapMap pops indicated.

meansone <- MergedDat[[2]] %>% group_by(pop, super_pop) %>% 
            select(contains("PC")) %>% summarise_all(funs(mean))
## Adding missing grouping variables: `pop`, `super_pop`
plotonesamp <- function(dat1, dat2, x1, y1){
  x2 <- paste("Gen_", x1, sep = "")
  y2 <- paste("Gen_", y1, sep = "")
  colnames(dat2)[which(colnames(dat2) == x2)] <- x1
  colnames(dat2)[which(colnames(dat2) == y2)] <- y1
  p1 <- ggplot(data = dat2, aes_string(x1, y1)) + 
                 geom_point(aes(colour = prediction), alpha = 0.3) +
                  ggrepel::geom_label_repel(data = dat1, aes(label = pop), 
                            alpha = 0.8) + 
          theme_pubr(legend = "none")
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}
  
p1 <- plotonesamp(meansone, MergedDat[[1]], "PC1", "PC2")
p2 <- plotonesamp(meansone, MergedDat[[1]], "PC3", "PC4")

ggarrange(p1, p2,
          nrow = 1, ncol = 2,
          align = "v")

Finally, let’s take a look at the admixture analysis. Based on visual inspection, these are the approximate cluster by geography associations

Cluster POP
1 NEU
2 EAS
3 SAS
4 SEU
5 AMR
6 AFR
groups <- tibble(IID = rownames(MergedDat[[3]]$Merge1000Gsamples.6.Q))
groups <- groups %>% left_join(MergedDat[[1]], by = "IID") %>% select(IID, prediction)
groups <- groups %>% left_join(MergedDat[[2]], by = "IID") %>% select(IID, prediction, super_pop, pop)
groups <- groups %>%
  mutate(super_pop = ifelse(is.na(prediction), as.character(super_pop), as.character(prediction)),
         sample = ifelse(is.na(prediction), "1000Genomes", "ADAPT"),
         pop = ifelse(is.na(prediction), as.character(pop), as.character(prediction))) %>%
  select(IID, super_pop, pop, sample)

cleanadmix <- MergedDat[[3]]$Merge1000Gsamples.6.Q %>% filter(!is.na(groups$pop))
groups     <- groups %>% filter(!is.na(groups$pop))
cleanadmix <- list("sample1"=cleanadmix)

p1 <- plotQ(cleanadmix, returnplot=T, exportplot=F, quiet=T, basesize=11, showsp=F,
            grplab=as.data.frame(groups[,c(4,2,3)]), ordergrp=TRUE, grplabangle = 90)
plot(p1$plot[[1]])

Face plots

Here we’ll look at some basic 3D plots of average facial morphology looking at the differences between populations and sexes

avgcoords    <- MergedDat[[1]] %>% group_by(prediction, Sex) %>% select(contains("Face_")) %>% 
                    summarise_all(funs(mean))
## Adding missing grouping variables: `prediction`, `Sex`
avglandmarks <- apply(as.matrix(avgcoords[, 3:89] * 1) %*% t(as.matrix(face.eigenvectors)), 1, 
                              function(x) x + t(face.means))

source("Distances.R")
eucdist <- tibble(AFR = rep(0, nrow(face.means)/3), 
                  AMR = rep(0, nrow(face.means)/3), 
                  EAS = rep(0, nrow(face.means)/3),
                  EUR = rep(0, nrow(face.means)/3),
                  SAS = rep(0, nrow(face.means)/3)) 
                      
col <- 1
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
for(i in seq(1, 10, 2)){
  eucdist[, col] <- getDistance(avglandmarks[,i], avglandmarks[,i+1])
  eucdist[, col] <- range01(eucdist[, col])
  col <- col + 1
}

source("PlotFaces.R")

#PlotMultipleFaces(avglandmarks[,1], as.data.frame(face.facets), as.data.frame(eucdist))
PlotComparisonFaces(avglandmarks, as.data.frame(face.facets))
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout